1 /*
2 **********************************************************************
3 *   Copyright (C) 1999-2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/17/99    aliu        Creation.
8 **********************************************************************
9 */
10 #ifndef RBT_H
11 #define RBT_H
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/translit.h"
18 #include "unicode/utypes.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/udata.h"
21 
22 #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit"
23 
24 U_NAMESPACE_BEGIN
25 
26 class TransliterationRuleData;
27 
28 /**
29  * <p><code>RuleBasedTransliterator</code> is a transliterator
30  * that reads a set of rules in order to determine how to perform
31  * translations. Rule sets are stored in resource bundles indexed by
32  * name. Rules within a rule set are separated by semicolons (';').
33  * To include a literal semicolon, prefix it with a backslash ('\').
34  * Whitespace, as defined by <code>Character.isWhitespace()</code>,
35  * is ignored. If the first non-blank character on a line is '#',
36  * the entire line is ignored as a comment. </p>
37  *
38  * <p>Each set of rules consists of two groups, one forward, and one
39  * reverse. This is a convention that is not enforced; rules for one
40  * direction may be omitted, with the result that translations in
41  * that direction will not modify the source text. In addition,
42  * bidirectional forward-reverse rules may be specified for
43  * symmetrical transformations.</p>
44  *
45  * <p><b>Rule syntax</b> </p>
46  *
47  * <p>Rule statements take one of the following forms: </p>
48  *
49  * <dl>
50  *     <dt><code>$alefmadda=\u0622;</code></dt>
51  *     <dd><strong>Variable definition.</strong> The name on the
52  *         left is assigned the text on the right. In this example,
53  *         after this statement, instances of the left hand name,
54  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
55  *         the Unicode character U+0622. Variable names must begin
56  *         with a letter and consist only of letters, digits, and
57  *         underscores. Case is significant. Duplicate names cause
58  *         an exception to be thrown, that is, variables cannot be
59  *         redefined. The right hand side may contain well-formed
60  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
61  *         The right hand side may contain embedded <code>UnicodeSet</code>
62  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
63  *     <dd>&nbsp;</dd>
64  *     <dt><code>ai&gt;$alefmadda;</code></dt>
65  *     <dd><strong>Forward translation rule.</strong> This rule
66  *         states that the string on the left will be changed to the
67  *         string on the right when performing forward
68  *         transliteration.</dd>
69  *     <dt>&nbsp;</dt>
70  *     <dt><code>ai&lt;$alefmadda;</code></dt>
71  *     <dd><strong>Reverse translation rule.</strong> This rule
72  *         states that the string on the right will be changed to
73  *         the string on the left when performing reverse
74  *         transliteration.</dd>
75  * </dl>
76  *
77  * <dl>
78  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
79  *     <dd><strong>Bidirectional translation rule.</strong> This
80  *         rule states that the string on the left will be changed
81  *         to the string on the right when performing forward
82  *         transliteration, and vice versa when performing reverse
83  *         transliteration.</dd>
84  * </dl>
85  *
86  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
87  * string</em>. The match pattern consists of literal characters,
88  * optionally preceded by context, and optionally followed by
89  * context. Context characters, like literal pattern characters,
90  * must be matched in the text being transliterated. However, unlike
91  * literal pattern characters, they are not replaced by the output
92  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
93  * indicates the characters &quot;<code>def</code>&quot; must be
94  * preceded by &quot;<code>abc</code>&quot; for a successful match.
95  * If there is a successful match, &quot;<code>def</code>&quot; will
96  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
97  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
98  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
99  * (or &quot;<code>123}456</code>&quot;) in which the literal
100  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
101  * </p>
102  *
103  * <p>The output string of a forward or reverse rule consists of
104  * characters to replace the literal pattern characters. If the
105  * output string contains the character '<code>|</code>', this is
106  * taken to indicate the location of the <em>cursor</em> after
107  * replacement. The cursor is the point in the text at which the
108  * next replacement, if any, will be applied. The cursor is usually
109  * placed within the replacement text; however, it can actually be
110  * placed into the preceding or following context by using the
111  * special character '<code>@</code>'. Examples:</p>
112  *
113  * <blockquote>
114  *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
115  *     before a<br>
116  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
117  *     y and z</code></p>
118  * </blockquote>
119  *
120  * <p><b>UnicodeSet</b></p>
121  *
122  * <p><code>UnicodeSet</code> patterns may appear anywhere that
123  * makes sense. They may appear in variable definitions.
124  * Contrariwise, <code>UnicodeSet</code> patterns may themselves
125  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
126  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
127  *
128  * <p><code>UnicodeSet</code> patterns may also be embedded directly
129  * into rule strings. Thus, the following two rules are equivalent:</p>
130  *
131  * <blockquote>
132  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
133  *     [aeiou]&gt;'*';
134  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
135  *     Another way</code></p>
136  * </blockquote>
137  *
138  * <p>See {@link UnicodeSet} for more documentation and examples.</p>
139  *
140  * <p><b>Segments</b></p>
141  *
142  * <p>Segments of the input string can be matched and copied to the
143  * output string. This makes certain sets of rules simpler and more
144  * general, and makes reordering possible. For example:</p>
145  *
146  * <blockquote>
147  *     <p><code>([a-z]) &gt; $1 $1;
148  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
149  *     double lowercase letters<br>
150  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
151  * </blockquote>
152  *
153  * <p>The segment of the input string to be copied is delimited by
154  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
155  * nine segments may be defined. Segments may not overlap. In the
156  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
157  * represent the input string segments, in left-to-right order of
158  * definition.</p>
159  *
160  * <p><b>Anchors</b></p>
161  *
162  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
163  * special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
164  *
165  * <blockquote>
166  *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
167  *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
168  *   of 'a'<br>
169  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>
170  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
171  *   of 'z'</code></p>
172  * </blockquote>
173  *
174  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
175  * This is done by including a virtual anchor character '<code>$</code>' at the end of the
176  * set pattern. Although this is usually the match character for the end anchor, the set will
177  * match either the beginning or the end of the text, depending on its placement. For
178  * example:</p>
179  *
180  * <blockquote>
181  *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
182  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
183  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>
184  * </blockquote>
185  *
186  * <p><b>Example</b> </p>
187  *
188  * <p>The following example rules illustrate many of the features of
189  * the rule language. </p>
190  *
191  * <table border="0" cellpadding="4">
192  *     <tr>
193  *         <td valign="top">Rule 1.</td>
194  *         <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>
195  *     </tr>
196  *     <tr>
197  *         <td valign="top">Rule 2.</td>
198  *         <td valign="top" nowrap><code>xyz&gt;r</code></td>
199  *     </tr>
200  *     <tr>
201  *         <td valign="top">Rule 3.</td>
202  *         <td valign="top" nowrap><code>yz&gt;q</code></td>
203  *     </tr>
204  * </table>
205  *
206  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
207  * yields the following results: </p>
208  *
209  * <table border="0" cellpadding="4">
210  *     <tr>
211  *         <td valign="top" nowrap><code>|adefabcdefz</code></td>
212  *         <td valign="top">Initial state, no rules match. Advance
213  *         cursor.</td>
214  *     </tr>
215  *     <tr>
216  *         <td valign="top" nowrap><code>a|defabcdefz</code></td>
217  *         <td valign="top">Still no match. Rule 1 does not match
218  *         because the preceding context is not present.</td>
219  *     </tr>
220  *     <tr>
221  *         <td valign="top" nowrap><code>ad|efabcdefz</code></td>
222  *         <td valign="top">Still no match. Keep advancing until
223  *         there is a match...</td>
224  *     </tr>
225  *     <tr>
226  *         <td valign="top" nowrap><code>ade|fabcdefz</code></td>
227  *         <td valign="top">...</td>
228  *     </tr>
229  *     <tr>
230  *         <td valign="top" nowrap><code>adef|abcdefz</code></td>
231  *         <td valign="top">...</td>
232  *     </tr>
233  *     <tr>
234  *         <td valign="top" nowrap><code>adefa|bcdefz</code></td>
235  *         <td valign="top">...</td>
236  *     </tr>
237  *     <tr>
238  *         <td valign="top" nowrap><code>adefab|cdefz</code></td>
239  *         <td valign="top">...</td>
240  *     </tr>
241  *     <tr>
242  *         <td valign="top" nowrap><code>adefabc|defz</code></td>
243  *         <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;
244  *         with &quot;<code>xy</code>&quot; and back up the cursor
245  *         to before the '<code>y</code>'.</td>
246  *     </tr>
247  *     <tr>
248  *         <td valign="top" nowrap><code>adefabcx|yz</code></td>
249  *         <td valign="top">Although &quot;<code>xyz</code>&quot; is
250  *         present, rule 2 does not match because the cursor is
251  *         before the '<code>y</code>', not before the '<code>x</code>'.
252  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
253  *         with &quot;<code>q</code>&quot;.</td>
254  *     </tr>
255  *     <tr>
256  *         <td valign="top" nowrap><code>adefabcxq|</code></td>
257  *         <td valign="top">The cursor is at the end;
258  *         transliteration is complete.</td>
259  *     </tr>
260  * </table>
261  *
262  * <p>The order of rules is significant. If multiple rules may match
263  * at some point, the first matching rule is applied. </p>
264  *
265  * <p>Forward and reverse rules may have an empty output string.
266  * Otherwise, an empty left or right hand side of any statement is a
267  * syntax error. </p>
268  *
269  * <p>Single quotes are used to quote any character other than a
270  * digit or letter. To specify a single quote itself, inside or
271  * outside of quotes, use two single quotes in a row. For example,
272  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
273  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
274  * </p>
275  *
276  * <p><b>Notes</b> </p>
277  *
278  * <p>While a RuleBasedTransliterator is being built, it checks that
279  * the rules are added in proper order. For example, if the rule
280  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
281  * then the second rule will throw an exception. The reason is that
282  * the second rule can never be triggered, since the first rule
283  * always matches anything it matches. In other words, the first
284  * rule <em>masks</em> the second rule. </p>
285  *
286  * @author Alan Liu
287  * @internal Use transliterator factory methods instead since this class will be removed in that release.
288  */
289 class RuleBasedTransliterator : public Transliterator {
290 private:
291     /**
292      * The data object is immutable, so we can freely share it with
293      * other instances of RBT, as long as we do NOT own this object.
294      *  TODO:  data is no longer immutable.  See bugs #1866, 2155
295      */
296     TransliterationRuleData* fData;
297 
298     /**
299      * If true, we own the data object and must delete it.
300      */
301     UBool isDataOwned;
302 
303 public:
304 
305     /**
306      * Constructs a new transliterator from the given rules.
         * @param id the id for the transliterator.
307      * @param rules rules, separated by ';'
308      * @param direction either FORWARD or REVERSE.
         * @param adoptedFilter the filter for the transliterator
         *        (presumably adopted, i.e. owned by this object afterwards,
         *        going by the parameter name -- TODO confirm).
         * @param parseError NOTE(review): presumably receives details of a
         *        rule parse failure -- confirm against the implementation.
         * @param status NOTE(review): presumably the ICU error code through
         *        which failure is reported -- confirm.
309      * @exception IllegalArgumentException if rules are malformed.
         *        NOTE(review): this signature takes a UErrorCode and a
         *        UParseError; the exception tag may be stale -- confirm.
310      * @internal Use transliterator factory methods instead since this class will be removed in that release.
311      */
312     RuleBasedTransliterator(const UnicodeString& id,
313                             const UnicodeString& rules,
314                             UTransDirection direction,
315                             UnicodeFilter* adoptedFilter,
316                             UParseError& parseError,
317                             UErrorCode& status);
318 
319     /**
320      * Constructs a new transliterator from the given rules.
         * (This overload is currently commented out.)
321      * @param rules rules, separated by ';'
322      * @param direction either FORWARD or REVERSE.
323      * @exception IllegalArgumentException if rules are malformed.
324      * @internal Use transliterator factory methods instead since this class will be removed in that release.
325      */
326     /*RuleBasedTransliterator(const UnicodeString& id,
327                             const UnicodeString& rules,
328                             UTransDirection direction,
329                             UnicodeFilter* adoptedFilter,
330                             UErrorCode& status);*/
331 
332     /**
333      * Convenience constructor with no filter.
         * (This overload is currently commented out.)
334      * @internal Use transliterator factory methods instead since this class will be removed in that release.
335      */
336     /*RuleBasedTransliterator(const UnicodeString& id,
337                             const UnicodeString& rules,
338                             UTransDirection direction,
339                             UErrorCode& status);*/
340 
341     /**
342      * Convenience constructor with no filter and FORWARD direction.
         * (This overload is currently commented out.)
343      * @internal Use transliterator factory methods instead since this class will be removed in that release.
344      */
345     /*RuleBasedTransliterator(const UnicodeString& id,
346                             const UnicodeString& rules,
347                             UErrorCode& status);*/
348 
349     /**
350      * Convenience constructor with FORWARD direction.
         * (This overload is currently commented out.)
351      * @internal Use transliterator factory methods instead since this class will be removed in that release.
352      */
353     /*RuleBasedTransliterator(const UnicodeString& id,
354                             const UnicodeString& rules,
355                             UnicodeFilter* adoptedFilter,
356                             UErrorCode& status);*/
357 private:
358 
359      friend class TransliteratorRegistry; // to access the private convenience ctor taking TransliterationRuleData
360     /**
361      * Convenience constructor.
362      * @param id            the id for the transliterator.
363      * @param theData       the rule data for the transliterator.
364      * @param adoptedFilter the filter for the transliterator
         *                      (defaults to 0, i.e. no filter).
365      */
366     RuleBasedTransliterator(const UnicodeString& id,
367                             const TransliterationRuleData* theData,
368                             UnicodeFilter* adoptedFilter = 0);
369 
370 
371     friend class Transliterator; // to access the following ctor
372 
373     /**
374      * Internal constructor.
375      * @param id            the id for the transliterator.
376      * @param data          the rule data for the transliterator.
377      * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'.
378      */
379     RuleBasedTransliterator(const UnicodeString& id,
380                             TransliterationRuleData* data,
381                             UBool isDataAdopted);
382 
383 public:
384 
385     /**
386      * Copy constructor.
         * NOTE(review): no copy-assignment operator is declared in this
         * class, while it holds a raw fData pointer plus an isDataOwned
         * flag -- confirm that assignment is never used or is blocked
         * elsewhere.
387      * @internal Use transliterator factory methods instead since this class will be removed in that release.
388      */
389     RuleBasedTransliterator(const RuleBasedTransliterator&);
390 
        /**
         * Destructor.  Per the isDataOwned member documentation, the data
         * object must be deleted when this instance owns it.
         */
391     virtual ~RuleBasedTransliterator();
392 
393     /**
394      * Implement Transliterator API.
         * @return a Transliterator pointer; presumably a newly allocated
         *         copy of this object owned by the caller -- TODO confirm.
395      * @internal Use transliterator factory methods instead since this class will be removed in that release.
396      */
397     virtual Transliterator* clone(void) const;
398 
399 protected:
400     /**
401      * Implements {@link Transliterator#handleTransliterate}.
402      * @internal Use transliterator factory methods instead since this class will be removed in that release.
403      */
404     virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
405                                      UBool isIncremental) const;
406 
407 public:
408     /**
409      * Return a representation of this transliterator as source rules.
410      * These rules will produce an equivalent transliterator if used
411      * to construct a new transliterator.
412      * @param result the string to receive the rules.  Previous
413      * contents will be deleted.
414      * @param escapeUnprintable if TRUE then convert unprintable
415      * character to their hex escape representations, \uxxxx or
416      * \Uxxxxxxxx.  Unprintable characters are those other than
417      * U+000A, U+0020..U+007E.
         * @return a UnicodeString reference (presumably 'result' itself
         * -- TODO confirm).
418      * @internal Use transliterator factory methods instead since this class will be removed in that release.
419      */
420     virtual UnicodeString& toRules(UnicodeString& result,
421                                    UBool escapeUnprintable) const;
422 
423 protected:
424     /**
425      * Implement Transliterator framework
         * @param result the set passed in by the caller; presumably filled
         * with the source characters of this transliterator -- TODO confirm.
426      */
427     virtual void handleGetSourceSet(UnicodeSet& result) const;
428 
429 public:
430     /**
431      * Override Transliterator framework
         * @param result the set passed in by the caller; presumably filled
         * with the target characters of this transliterator -- TODO confirm.
         * @return a UnicodeSet reference (presumably 'result' itself
         * -- TODO confirm).
432      */
433     virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
434 
435     /**
436      * Return the class ID for this class.  This is useful only for
437      * comparing to a return value from getDynamicClassID().  For example:
438      * <pre>
439      * .      Base* polymorphic_pointer = createPolymorphicObject();
440      * .      if (polymorphic_pointer->getDynamicClassID() ==
441      * .          Derived::getStaticClassID()) ...
442      * </pre>
443      * @return          The class ID for all objects of this class.
444      * @internal Use transliterator factory methods instead since this class will be removed in that release.
445      */
446     U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void);
447 
448     /**
449      * Returns a unique class ID <b>polymorphically</b>.  This method
450      * is to implement a simple version of RTTI, since not all C++
451      * compilers support genuine RTTI.  Polymorphic operator==() and
452      * clone() methods call this method.
453      *
454      * @return The class ID for this object. All objects of a given
455      * class have the same class ID.  Objects of other classes have
456      * different class IDs.
457      */
458     virtual UClassID getDynamicClassID(void) const;
459 
460 private:
461 
        /**
         * Private construction helper.  Its parameters match the rules,
         * direction, parseError and status parameters of the public
         * rules-based constructor.
         * NOTE(review): presumably parses 'rules' and sets up fData on
         * behalf of that constructor -- confirm against the implementation.
         */
462     void _construct(const UnicodeString& rules,
463                     UTransDirection direction,
464                     UParseError& parseError,
465                     UErrorCode& status);
466 };
467 
468 
469 U_NAMESPACE_END
470 
471 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
472 
473 #endif
474