1 /*
2 **********************************************************************
3 *   Copyright (C) 1999-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/17/99    aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/rep.h"
16 #include "unicode/uniset.h"
17 #include "rbt_pars.h"
18 #include "rbt_data.h"
19 #include "rbt_rule.h"
20 #include "rbt.h"
21 #include "umutex.h"
22 
23 U_NAMESPACE_BEGIN
24 
// Expands the RTTI boilerplate for RuleBasedTransliterator.
// NOTE(review): the macro is defined outside this file; presumably it supplies
// the class-ID accessors declared in the class header -- confirm there.
25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
26 
// Mutex taken by handleTransliterate() when the rule data is not owned by the
// transliterator object (isDataOwned == FALSE).
27 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
// Address of the text being processed by the handleTransliterate() call that
// currently holds transliteratorDataMutex, or NULL when nobody holds it.
// handleTransliterate() compares its own text against this pointer to decide
// whether it must take the mutex itself.
28 static Replaceable *gLockedText = NULL;
29 
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)30 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
31                                          UTransDirection direction,
32                                          UParseError& parseError,
33                                          UErrorCode& status) {
34     fData = 0;
35     isDataOwned = TRUE;
36     if (U_FAILURE(status)) {
37         return;
38     }
39 
40     TransliteratorParser parser(status);
41     parser.parse(rules, direction, parseError, status);
42     if (U_FAILURE(status)) {
43         return;
44     }
45 
46     if (parser.idBlockVector.size() != 0 ||
47         parser.compoundFilter != NULL ||
48         parser.dataVector.size() == 0) {
49         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
50         return;
51     }
52 
53     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
54     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
55 }
56 
57 /**
58  * Constructs a new transliterator from the given rules.
59  * @param id            the id for the transliterator.
60  * @param rules         rules, separated by ';'
61  * @param direction     either FORWARD or REVERSE.
62  * @param adoptedFilter the filter for this transliterator.
63  * @param parseError    Struct to recieve information on position
64  *                      of error if an error is encountered
65  * @param status        Output param set to success/failure code.
66  * @exception IllegalArgumentException if rules are malformed
67  * or direction is invalid.
68  */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)69 RuleBasedTransliterator::RuleBasedTransliterator(
70                             const UnicodeString& id,
71                             const UnicodeString& rules,
72                             UTransDirection direction,
73                             UnicodeFilter* adoptedFilter,
74                             UParseError& parseError,
75                             UErrorCode& status) :
76     Transliterator(id, adoptedFilter) {
77     _construct(rules, direction,parseError,status);
78 }
79 
80 /**
81  * Constructs a new transliterator from the given rules.
82  * @param id            the id for the transliterator.
83  * @param rules         rules, separated by ';'
84  * @param direction     either FORWARD or REVERSE.
85  * @param adoptedFilter the filter for this transliterator.
86  * @param status        Output param set to success/failure code.
87  * @exception IllegalArgumentException if rules are malformed
88  * or direction is invalid.
89  */
90 /*RuleBasedTransliterator::RuleBasedTransliterator(
91                             const UnicodeString& id,
92                             const UnicodeString& rules,
93                             UTransDirection direction,
94                             UnicodeFilter* adoptedFilter,
95                             UErrorCode& status) :
96     Transliterator(id, adoptedFilter) {
97     UParseError parseError;
98     _construct(rules, direction,parseError, status);
99 }*/
100 
101 /**
102  * Convenience constructor with no filter.
103  */
104 /*RuleBasedTransliterator::RuleBasedTransliterator(
105                             const UnicodeString& id,
106                             const UnicodeString& rules,
107                             UTransDirection direction,
108                             UErrorCode& status) :
109     Transliterator(id, 0) {
110     UParseError parseError;
111     _construct(rules, direction,parseError, status);
112 }*/
113 
114 /**
115  * Convenience constructor with no filter and FORWARD direction.
116  */
117 /*RuleBasedTransliterator::RuleBasedTransliterator(
118                             const UnicodeString& id,
119                             const UnicodeString& rules,
120                             UErrorCode& status) :
121     Transliterator(id, 0) {
122     UParseError parseError;
123     _construct(rules, UTRANS_FORWARD, parseError, status);
124 }*/
125 
126 /**
127  * Convenience constructor with FORWARD direction.
128  */
129 /*RuleBasedTransliterator::RuleBasedTransliterator(
130                             const UnicodeString& id,
131                             const UnicodeString& rules,
132                             UnicodeFilter* adoptedFilter,
133                             UErrorCode& status) :
134     Transliterator(id, adoptedFilter) {
135     UParseError parseError;
136     _construct(rules, UTRANS_FORWARD,parseError, status);
137 }*/
138 
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
140                                  const TransliterationRuleData* theData,
141                                  UnicodeFilter* adoptedFilter) :
142     Transliterator(id, adoptedFilter),
143     fData((TransliterationRuleData*)theData), // cast away const
144     isDataOwned(FALSE) {
145     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
146 }
147 
148 /**
149  * Internal constructor.
150  */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
152                                                  TransliterationRuleData* theData,
153                                                  UBool isDataAdopted) :
154     Transliterator(id, 0),
155     fData(theData),
156     isDataOwned(isDataAdopted) {
157     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
158 }
159 
160 /**
161  * Copy constructor.
162  */
RuleBasedTransliterator(const RuleBasedTransliterator & other)163 RuleBasedTransliterator::RuleBasedTransliterator(
164         const RuleBasedTransliterator& other) :
165     Transliterator(other), fData(other.fData),
166     isDataOwned(other.isDataOwned) {
167 
168     // The data object may or may not be owned.  If it is not owned we
169     // share it; it is invariant.  If it is owned, it's still
170     // invariant, but we need to copy it to prevent double-deletion.
171     // If this becomes a performance issue (if people do a lot of RBT
172     // copying -- unlikely) we can reference count the data object.
173 
174     // Only do a deep copy if this is owned data, that is, data that
175     // will be later deleted.  System transliterators contain
176     // non-owned data.
177     if (isDataOwned) {
178         fData = new TransliterationRuleData(*other.fData);
179     }
180 }
181 
182 /**
183  * Destructor.
184  */
~RuleBasedTransliterator()185 RuleBasedTransliterator::~RuleBasedTransliterator() {
186     // Delete the data object only if we own it.
187     if (isDataOwned) {
188         delete fData;
189     }
190 }
191 
192 Transliterator* // Covariant return NOT ALLOWED (for portability)
clone(void) const193 RuleBasedTransliterator::clone(void) const {
194     return new RuleBasedTransliterator(*this);
195 }
196 
197 /**
198  * Implements {@link Transliterator#handleTransliterate}.
199  */
200 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const201 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
202                                              UBool isIncremental) const {
203     /* We keep contextStart and contextLimit fixed the entire time,
204      * relative to the text -- contextLimit may move numerically if
205      * text is inserted or removed.  The start offset moves toward
206      * limit, with replacements happening under it.
207      *
208      * Example: rules 1. ab>x|y
209      *                2. yc>z
210      *
211      * |eabcd   begin - no match, advance start
212      * e|abcd   match rule 1 - change text & adjust start
213      * ex|ycd   match rule 2 - change text & adjust start
214      * exz|d    no match, advance start
215      * exzd|    done
216      */
217 
218     /* A rule like
219      *   a>b|a
220      * creates an infinite loop. To prevent that, we put an arbitrary
221      * limit on the number of iterations that we take, one that is
222      * high enough that any reasonable rules are ok, but low enough to
223      * prevent a server from hanging.  The limit is 16 times the
224      * number of characters n, unless n is so large that 16n exceeds a
225      * uint32_t.
226      */
227     uint32_t loopCount = 0;
228     uint32_t loopLimit = index.limit - index.start;
229     if (loopLimit >= 0x10000000) {
230         loopLimit = 0xFFFFFFFF;
231     } else {
232         loopLimit <<= 4;
233     }
234 
235     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
236     //   operations must be prevented.
237     // A Complication: compound transliterators can result in recursive entries to this
238     //   function, sometimes with different "This" objects, always with the same text.
239     //   Double-locking must be prevented in these cases.
240     //
241 
242     // If the transliteration data is exclusively owned by this transliterator object,
243     //   we don't need to do any locking.  No sharing between transliterators is possible,
244     //   so no concurrent access from multiple threads is possible.
245     UBool    lockedMutexAtThisLevel = FALSE;
246     if (isDataOwned == FALSE) {
247         // Test whether this request is operating on the same text string as some
248         //   some other transliteration that is still in progress and holding the
249         //   transliteration mutex.  If so, do not lock the transliteration
250         //    mutex again.
251         // TODO(andy): Need a better scheme for handling this.
252         UBool needToLock;
253         umtx_lock(NULL);
254         needToLock = (&text != gLockedText);
255         umtx_unlock(NULL);
256         if (needToLock) {
257             umtx_lock(&transliteratorDataMutex);
258             gLockedText = &text;
259             lockedMutexAtThisLevel = TRUE;
260         }
261     }
262 
263     // Check to make sure we don't dereference a null pointer.
264     if (fData != NULL) {
265 	    while (index.start < index.limit &&
266 	           loopCount <= loopLimit &&
267 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
268 	        ++loopCount;
269 	    }
270     }
271     if (lockedMutexAtThisLevel) {
272         gLockedText = NULL;
273         umtx_unlock(&transliteratorDataMutex);
274     }
275 }
276 
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const277 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
278                                                 UBool escapeUnprintable) const {
279     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
280 }
281 
282 /**
283  * Implement Transliterator framework
284  */
handleGetSourceSet(UnicodeSet & result) const285 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
286     fData->ruleSet.getSourceTargetSet(result, FALSE);
287 }
288 
289 /**
290  * Override Transliterator framework
291  */
getTargetSet(UnicodeSet & result) const292 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
293     return fData->ruleSet.getSourceTargetSet(result, TRUE);
294 }
295 
296 U_NAMESPACE_END
297 
298 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
299