1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/17/99    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/rep.h"
18 #include "unicode/uniset.h"
19 #include "rbt_pars.h"
20 #include "rbt_data.h"
21 #include "rbt_rule.h"
22 #include "rbt.h"
23 #include "mutex.h"
24 #include "umutex.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
29 
// Mutex held by handleTransliterate() while it applies the rule set; it is
// locked only by the outermost call operating on a given text.
30 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
// Text object recorded by the handleTransliterate() call that currently holds
// transliteratorDataMutex, or NULL when no such call is in progress.  Read and
// written only while holding the global ICU mutex (a default-constructed Mutex).
31 static Replaceable *gLockedText = NULL;
32 
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)33 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
34                                          UTransDirection direction,
35                                          UParseError& parseError,
36                                          UErrorCode& status) {
37     fData = 0;
38     isDataOwned = TRUE;
39     if (U_FAILURE(status)) {
40         return;
41     }
42 
43     TransliteratorParser parser(status);
44     parser.parse(rules, direction, parseError, status);
45     if (U_FAILURE(status)) {
46         return;
47     }
48 
49     if (parser.idBlockVector.size() != 0 ||
50         parser.compoundFilter != NULL ||
51         parser.dataVector.size() == 0) {
52         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
53         return;
54     }
55 
56     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
57     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
58 }
59 
60 /**
61  * Constructs a new transliterator from the given rules.
62  * @param id            the id for the transliterator.
63  * @param rules         rules, separated by ';'
64  * @param direction     either FORWARD or REVERSE.
65  * @param adoptedFilter the filter for this transliterator.
66  * @param parseError    Struct to recieve information on position
67  *                      of error if an error is encountered
68  * @param status        Output param set to success/failure code.
69  * @exception IllegalArgumentException if rules are malformed
70  * or direction is invalid.
71  */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)72 RuleBasedTransliterator::RuleBasedTransliterator(
73                             const UnicodeString& id,
74                             const UnicodeString& rules,
75                             UTransDirection direction,
76                             UnicodeFilter* adoptedFilter,
77                             UParseError& parseError,
78                             UErrorCode& status) :
79     Transliterator(id, adoptedFilter) {
80     _construct(rules, direction,parseError,status);
81 }
82 
83 /**
84  * Constructs a new transliterator from the given rules.
85  * @param id            the id for the transliterator.
86  * @param rules         rules, separated by ';'
87  * @param direction     either FORWARD or REVERSE.
88  * @param adoptedFilter the filter for this transliterator.
89  * @param status        Output param set to success/failure code.
90  * @exception IllegalArgumentException if rules are malformed
91  * or direction is invalid.
92  */
93 /*RuleBasedTransliterator::RuleBasedTransliterator(
94                             const UnicodeString& id,
95                             const UnicodeString& rules,
96                             UTransDirection direction,
97                             UnicodeFilter* adoptedFilter,
98                             UErrorCode& status) :
99     Transliterator(id, adoptedFilter) {
100     UParseError parseError;
101     _construct(rules, direction,parseError, status);
102 }*/
103 
104 /**
105  * Convenience constructor with no filter.
106  */
107 /*RuleBasedTransliterator::RuleBasedTransliterator(
108                             const UnicodeString& id,
109                             const UnicodeString& rules,
110                             UTransDirection direction,
111                             UErrorCode& status) :
112     Transliterator(id, 0) {
113     UParseError parseError;
114     _construct(rules, direction,parseError, status);
115 }*/
116 
117 /**
118  * Convenience constructor with no filter and FORWARD direction.
119  */
120 /*RuleBasedTransliterator::RuleBasedTransliterator(
121                             const UnicodeString& id,
122                             const UnicodeString& rules,
123                             UErrorCode& status) :
124     Transliterator(id, 0) {
125     UParseError parseError;
126     _construct(rules, UTRANS_FORWARD, parseError, status);
127 }*/
128 
129 /**
130  * Convenience constructor with FORWARD direction.
131  */
132 /*RuleBasedTransliterator::RuleBasedTransliterator(
133                             const UnicodeString& id,
134                             const UnicodeString& rules,
135                             UnicodeFilter* adoptedFilter,
136                             UErrorCode& status) :
137     Transliterator(id, adoptedFilter) {
138     UParseError parseError;
139     _construct(rules, UTRANS_FORWARD,parseError, status);
140 }*/
141 
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)142 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
143                                  const TransliterationRuleData* theData,
144                                  UnicodeFilter* adoptedFilter) :
145     Transliterator(id, adoptedFilter),
146     fData((TransliterationRuleData*)theData), // cast away const
147     isDataOwned(FALSE) {
148     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
149 }
150 
151 /**
152  * Internal constructor.
153  */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)154 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
155                                                  TransliterationRuleData* theData,
156                                                  UBool isDataAdopted) :
157     Transliterator(id, 0),
158     fData(theData),
159     isDataOwned(isDataAdopted) {
160     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
161 }
162 
163 /**
164  * Copy constructor.
165  */
RuleBasedTransliterator(const RuleBasedTransliterator & other)166 RuleBasedTransliterator::RuleBasedTransliterator(
167         const RuleBasedTransliterator& other) :
168     Transliterator(other), fData(other.fData),
169     isDataOwned(other.isDataOwned) {
170 
171     // The data object may or may not be owned.  If it is not owned we
172     // share it; it is invariant.  If it is owned, it's still
173     // invariant, but we need to copy it to prevent double-deletion.
174     // If this becomes a performance issue (if people do a lot of RBT
175     // copying -- unlikely) we can reference count the data object.
176 
177     // Only do a deep copy if this is owned data, that is, data that
178     // will be later deleted.  System transliterators contain
179     // non-owned data.
180     if (isDataOwned) {
181         fData = new TransliterationRuleData(*other.fData);
182     }
183 }
184 
185 /**
186  * Destructor.
187  */
~RuleBasedTransliterator()188 RuleBasedTransliterator::~RuleBasedTransliterator() {
189     // Delete the data object only if we own it.
190     if (isDataOwned) {
191         delete fData;
192     }
193 }
194 
195 Transliterator* // Covariant return NOT ALLOWED (for portability)
clone(void) const196 RuleBasedTransliterator::clone(void) const {
197     return new RuleBasedTransliterator(*this);
198 }
199 
200 /**
201  * Implements {@link Transliterator#handleTransliterate}.
202  */
203 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const204 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
205                                              UBool isIncremental) const {
206     /* We keep contextStart and contextLimit fixed the entire time,
207      * relative to the text -- contextLimit may move numerically if
208      * text is inserted or removed.  The start offset moves toward
209      * limit, with replacements happening under it.
210      *
211      * Example: rules 1. ab>x|y
212      *                2. yc>z
213      *
214      * |eabcd   begin - no match, advance start
215      * e|abcd   match rule 1 - change text & adjust start
216      * ex|ycd   match rule 2 - change text & adjust start
217      * exz|d    no match, advance start
218      * exzd|    done
219      */
220 
221     /* A rule like
222      *   a>b|a
223      * creates an infinite loop. To prevent that, we put an arbitrary
224      * limit on the number of iterations that we take, one that is
225      * high enough that any reasonable rules are ok, but low enough to
226      * prevent a server from hanging.  The limit is 16 times the
227      * number of characters n, unless n is so large that 16n exceeds a
228      * uint32_t.
229      */
230     uint32_t loopCount = 0;
231     uint32_t loopLimit = index.limit - index.start;
232     if (loopLimit >= 0x10000000) {
233         loopLimit = 0xFFFFFFFF;
234     } else {
235         loopLimit <<= 4;
236     }
237 
238     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
239     //   operations must be prevented.
240     // A Complication: compound transliterators can result in recursive entries to this
241     //   function, sometimes with different "This" objects, always with the same text.
242     //   Double-locking must be prevented in these cases.
243     //
244 
245     UBool    lockedMutexAtThisLevel = FALSE;
246 
247     // Test whether this request is operating on the same text string as
248     //   some other transliteration that is still in progress and holding the
249     //   transliteration mutex.  If so, do not lock the transliteration
250     //    mutex again.
251     //
252     //  gLockedText variable is protected by the global ICU mutex.
253     //  Shared RBT data protected by transliteratorDataMutex.
254     //
255     // TODO(andy): Need a better scheme for handling this.
256     UBool needToLock;
257     {
258         Mutex m;
259         needToLock = (&text != gLockedText);
260     }
261     if (needToLock) {
262         umtx_lock(&transliteratorDataMutex);  // Contention, longish waits possible here.
263         Mutex m;
264         gLockedText = &text;
265         lockedMutexAtThisLevel = TRUE;
266     }
267 
268     // Check to make sure we don't dereference a null pointer.
269     if (fData != NULL) {
270 	    while (index.start < index.limit &&
271 	           loopCount <= loopLimit &&
272 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
273 	        ++loopCount;
274 	    }
275     }
276     if (lockedMutexAtThisLevel) {
277         {
278             Mutex m;
279             gLockedText = NULL;
280         }
281         umtx_unlock(&transliteratorDataMutex);
282     }
283 }
284 
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const285 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
286                                                 UBool escapeUnprintable) const {
287     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
288 }
289 
290 /**
291  * Implement Transliterator framework
292  */
handleGetSourceSet(UnicodeSet & result) const293 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
294     fData->ruleSet.getSourceTargetSet(result, FALSE);
295 }
296 
297 /**
298  * Override Transliterator framework
299  */
getTargetSet(UnicodeSet & result) const300 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
301     return fData->ruleSet.getSourceTargetSet(result, TRUE);
302 }
303 
304 U_NAMESPACE_END
305 
306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
307