1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/17/99    aliu        Creation.
8 **********************************************************************
9 */
10 #ifndef RBT_RULE_H
11 #define RBT_RULE_H
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/uobject.h"
18 #include "unicode/unistr.h"
19 #include "unicode/utrans.h"
20 #include "unicode/unimatch.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 class Replaceable;
25 class TransliterationRuleData;
26 class StringMatcher;
27 class UnicodeFunctor;
28 
29 /**
30  * A transliteration rule used by
31  * <code>RuleBasedTransliterator</code>.
32  * <code>TransliterationRule</code> is an immutable object.
33  *
34  * <p>A rule consists of an input pattern and an output string.  When
35  * the input pattern is matched, the output string is emitted.  The
36  * input pattern consists of zero or more characters which are matched
37  * exactly (the key) and optional context.  Context must match if it
38  * is specified.  Context may be specified before the key, after the
39  * key, or both.  The key, preceding context, and following context
40  * may contain variables.  Variables represent a set of Unicode
41  * characters, such as the letters <i>a</i> through <i>z</i>.
42  * Variables are detected by looking up each character in a supplied
43  * variable list to see if it has been so defined.
44  *
45  * <p>A rule may contain segments in its input string and segment
46  * references in its output string.  A segment is a substring of the
47  * input pattern, indicated by an offset and limit.  The segment may
48  * be in the preceding or following context.  It may not span a
49  * context boundary.  A segment reference is a special character in
50  * the output string that causes a segment of the input string (not
51  * the input pattern) to be copied to the output string.  The range of
52  * special characters that represent segment references is defined by
53  * RuleBasedTransliterator.Data.
54  *
55  * @author Alan Liu
56  */
57 class TransliterationRule : public UMemory {
58 
59 private:
60 
61     // TODO Eliminate the pattern and keyLength data members.  They
62     // are used only by masks() and getIndexValue() which are called
63     // only during build time, not during run-time.  Perhaps these
64     // methods and pattern/keyLength can be isolated into a separate
65     // object.
66 
67     /**
68      * The match that must occur before the key, or null if there is no
69      * preceding context.
70      */
71     StringMatcher *anteContext;
72 
73     /**
74      * The matcher object for the key.  If null, then the key is empty.
75      */
76     StringMatcher *key;
77 
78     /**
79      * The match that must occur after the key, or null if there is no
80      * following context.
81      */
82     StringMatcher *postContext;
83 
84     /**
85      * The object that performs the replacement if the key,
86      * anteContext, and postContext are matched.  Never null.
87      */
88     UnicodeFunctor* output;
89 
90     /**
91      * The string that must be matched, consisting of the anteContext, key,
92      * and postContext, concatenated together, in that order.  Some components
93      * may be empty (zero length).
94      * @see anteContextLength
95      * @see keyLength
96      */
97     UnicodeString pattern;
98 
99     /**
100      * An array of matcher objects corresponding to the input pattern
101      * segments.  If there are no segments this is null.  N.B. This is
102      * a UnicodeMatcher for generality, but in practice it is always a
103      * StringMatcher.  In the future we may generalize this, but for
104      * now we sometimes cast down to StringMatcher.
105      *
106      * The array is owned, but the pointers within it are not.
107      */
108     UnicodeFunctor** segments;
109 
110     /**
111      * The number of elements in segments[] or zero if segments is NULL.
112      */
113     int32_t segmentsCount;
114 
115     /**
116      * The length of the string that must match before the key.  If
117      * zero, then there is no matching requirement before the key.
118      * Substring [0,anteContextLength) of pattern is the anteContext.
119      */
120     int32_t anteContextLength;
121 
122     /**
123      * The length of the key.  Substring [anteContextLength,
124      * anteContextLength + keyLength) is the key.
125 
126      */
127     int32_t keyLength;
128 
129     /**
130      * Miscellaneous attributes.
131      */
132     int8_t flags;
133 
134     /**
135      * Flag attributes.
136      */
137     enum {
138         ANCHOR_START = 1,
139         ANCHOR_END   = 2
140     };
141 
142     /**
143      * An alias pointer to the data for this rule.  The data provides
144      * lookup services for matchers and segments.
145      */
146     const TransliterationRuleData* data;
147 
148 public:
149 
150     /**
151      * Construct a new rule with the given input, output text, and other
152      * attributes.  A cursor position may be specified for the output text.
153      * @param input          input string, including key and optional ante and
154      *                       post context.
155      * @param anteContextPos offset into input to end of ante context, or -1 if
156      *                       none.  Must be <= input.length() if not -1.
157      * @param postContextPos offset into input to start of post context, or -1
158      *                       if none.  Must be <= input.length() if not -1, and must be >=
159      *                       anteContextPos.
160      * @param outputStr      output string.
161      * @param cursorPosition offset into output at which cursor is located, or -1 if
162      *                       none.  If less than zero, then the cursor is placed after the
163      *                       <code>output</code>; that is, -1 is equivalent to
164      *                       <code>output.length()</code>.  If greater than
165      *                       <code>output.length()</code> then an exception is thrown.
166      * @param cursorOffset   an offset to be added to cursorPos to position the
167      *                       cursor either in the ante context, if < 0, or in the post context, if >
168      *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
169      *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
170      *                       of -3.
171      * @param segs           array of UnicodeMatcher corresponding to input pattern
172      *                       segments, or null if there are none.  The array itself is adopted,
173      *                       but the pointers within it are not.
174      * @param segsCount      number of elements in segs[].
175      * @param anchorStart    TRUE if the the rule is anchored on the left to
176      *                       the context start.
177      * @param anchorEnd      TRUE if the rule is anchored on the right to the
178      *                       context limit.
179      * @param data           the rule data.
180      * @param status         Output parameter filled in with success or failure status.
181      */
182     TransliterationRule(const UnicodeString& input,
183                         int32_t anteContextPos, int32_t postContextPos,
184                         const UnicodeString& outputStr,
185                         int32_t cursorPosition, int32_t cursorOffset,
186                         UnicodeFunctor** segs,
187                         int32_t segsCount,
188                         UBool anchorStart, UBool anchorEnd,
189                         const TransliterationRuleData* data,
190                         UErrorCode& status);
191 
192     /**
193      * Copy constructor.
194      * @param other    the object to be copied.
195      */
196     TransliterationRule(TransliterationRule& other);
197 
198     /**
199      * Destructor.
200      */
201     virtual ~TransliterationRule();
202 
203     /**
204      * Change the data object that this rule belongs to.  Used
205      * internally by the TransliterationRuleData copy constructor.
206      * @param data    the new data value to be set.
207      */
208     void setData(const TransliterationRuleData* data);
209 
210     /**
211      * Return the preceding context length.  This method is needed to
212      * support the <code>Transliterator</code> method
213      * <code>getMaximumContextLength()</code>.  Internally, this is
214      * implemented as the anteContextLength, optionally plus one if
215      * there is a start anchor.  The one character anchor gap is
216      * needed to make repeated incremental transliteration with
217      * anchors work.
218      * @return    the preceding context length.
219      */
220     virtual int32_t getContextLength(void) const;
221 
222     /**
223      * Internal method.  Returns 8-bit index value for this rule.
224      * This is the low byte of the first character of the key,
225      * unless the first character of the key is a set.  If it's a
226      * set, or otherwise can match multiple keys, the index value is -1.
227      * @return    8-bit index value for this rule.
228      */
229     int16_t getIndexValue() const;
230 
231     /**
232      * Internal method.  Returns true if this rule matches the given
233      * index value.  The index value is an 8-bit integer, 0..255,
234      * representing the low byte of the first character of the key.
235      * It matches this rule if it matches the first character of the
236      * key, or if the first character of the key is a set, and the set
237      * contains any character with a low byte equal to the index
238      * value.  If the rule contains only ante context, as in foo)>bar,
239      * then it will match any key.
240      * @param v    the given index value.
241      * @return     true if this rule matches the given index value.
242      */
243     UBool matchesIndexValue(uint8_t v) const;
244 
245     /**
246      * Return true if this rule masks another rule.  If r1 masks r2 then
247      * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
248      * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
249      * "[c]a>x" masks "[dc]a>y".
250      * @param r2  the given rule to be compared with.
251      * @return    true if this rule masks 'r2'
252      */
253     virtual UBool masks(const TransliterationRule& r2) const;
254 
255     /**
256      * Attempt a match and replacement at the given position.  Return
257      * the degree of match between this rule and the given text.  The
258      * degree of match may be mismatch, a partial match, or a full
259      * match.  A mismatch means at least one character of the text
260      * does not match the context or key.  A partial match means some
261      * context and key characters match, but the text is not long
262      * enough to match all of them.  A full match means all context
263      * and key characters match.
264      *
265      * If a full match is obtained, perform a replacement, update pos,
266      * and return U_MATCH.  Otherwise both text and pos are unchanged.
267      *
268      * @param text the text
269      * @param pos the position indices
270      * @param incremental if TRUE, test for partial matches that may
271      * be completed by additional text inserted at pos.limit.
272      * @return one of <code>U_MISMATCH</code>,
273      * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
274      * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
275      */
276     UMatchDegree matchAndReplace(Replaceable& text,
277                                  UTransPosition& pos,
278                                  UBool incremental) const;
279 
280     /**
281      * Create a rule string that represents this rule object.  Append
282      * it to the given string.
283      */
284     virtual UnicodeString& toRule(UnicodeString& pat,
285                                   UBool escapeUnprintable) const;
286 
287     /**
288      * Union the set of all characters that may be modified by this rule
289      * into the given set.
290      */
291     void addSourceSetTo(UnicodeSet& toUnionTo) const;
292 
293     /**
294      * Union the set of all characters that may be emitted by this rule
295      * into the given set.
296      */
297     void addTargetSetTo(UnicodeSet& toUnionTo) const;
298 
299  private:
300 
301     friend class StringMatcher;
302 
303     TransliterationRule &operator=(const TransliterationRule &other); // forbid copying of this class
304 };
305 
306 U_NAMESPACE_END
307 
308 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
309 
310 #endif
311