1 /*
2  * Copyright (C) 2001-2011, International Business Machines Corporation
3  * and others. All Rights Reserved.
4  **********************************************************************
5  *   Date        Name        Description
6  *   07/23/01    aliu        Creation.
7  **********************************************************************
8  */
9 #ifndef STRMATCH_H
10 #define STRMATCH_H
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_TRANSLITERATION
15 
16 #include "unicode/unistr.h"
17 #include "unicode/unifunct.h"
18 #include "unicode/unimatch.h"
19 #include "unicode/unirepl.h"
20 
21 U_NAMESPACE_BEGIN
22 
23 class TransliterationRuleData;
24 
25 /**
26  * An object that matches a fixed input string, implementing the
27  * UnicodeMatcher API.  This object also implements the
28  * UnicodeReplacer API, allowing it to emit the matched text as
29  * output.  Since the match text may contain flexible match elements,
30  * such as UnicodeSets, the emitted text is not the match pattern, but
31  * instead a substring of the actual matched text.  Following
32  * convention, the output text is the leftmost match seen up to this
33  * point.
34  *
35  * A StringMatcher may represent a segment, in which case it has a
36  * positive segment number.  This affects how the matcher converts
37  * itself to a pattern but does not otherwise affect its function.
38  *
39  * A StringMatcher that is not a segment should not be used as a
40  * UnicodeReplacer.
41  */
42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
43 
44  public:
45 
46     /**
47      * Construct a matcher that matches the given pattern string.
48      * @param string the pattern to be matched, possibly containing
49      * stand-ins that represent nested UnicodeMatcher objects.
50      * @param start inclusive start index of text to be replaced
51      * @param limit exclusive end index of text to be replaced;
52      * must be greater than or equal to start.  NOTE(review): the start/limit wording is identical to replace(); presumably these are offsets into 'string' -- confirm.
53      * @param segmentNum the segment number from 1..n, or 0 if this is
54      * not a segment.
55      * @param data context object mapping stand-ins to
56      * UnicodeMatcher objects.
57      */
58     StringMatcher(const UnicodeString& string,
59                   int32_t start,
60                   int32_t limit,
61                   int32_t segmentNum,
62                   const TransliterationRuleData& data);
63 
64     /**
65      * Copy constructor
66      * @param o  the object to be copied.
67      */
68     StringMatcher(const StringMatcher& o);
69 
70     /**
71      * Destructor
72      */
73     virtual ~StringMatcher();
74 
75     /**
76      * Implement UnicodeFunctor
77      * @return a copy of the object, typed as a UnicodeFunctor pointer.
78      */
79     virtual UnicodeFunctor* clone() const;
80 
81     /**
82      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
83      * and return the pointer.
84      * @return the UnicodeMatcher pointer.
85      */
86     virtual UnicodeMatcher* toMatcher() const;
87 
88     /**
89      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
90      * and return the pointer.
91      * @return the UnicodeReplacer pointer.
92      */
93     virtual UnicodeReplacer* toReplacer() const;
94 
95     /**
96      * Implement UnicodeMatcher
97      * @param text the text to be matched
98      * @param offset on input, the index into text at which to begin
99      * matching.  On output, the limit of the matched text.  The
100      * number of matched characters is the output value of offset
101      * minus the input value.  Offset should always point to the
102      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
103      * both on entry and upon return.
104      * @param limit the limit index of text to be matched.  Greater
105      * than offset for a forward direction match, less than offset for
106      * a backward direction match.  The last character to be
107      * considered for matching will be text.charAt(limit-1) in the
108      * forward direction or text.charAt(limit+1) in the backward
109      * direction.
110      * @param incremental  if TRUE, then assume further characters may
111      * be inserted at limit and check for partial matching.  Otherwise
112      * assume the text as given is complete.
113      * @return a match degree value indicating a full match, a partial
114      * match, or a mismatch.  If incremental is FALSE then
115      * U_PARTIAL_MATCH should never be returned.
116      */
117     virtual UMatchDegree matches(const Replaceable& text,
118                                  int32_t& offset,
119                                  int32_t limit,
120                                  UBool incremental);
121 
122     /**
123      * Implement UnicodeMatcher
124      * @param result            Output param to receive the pattern.
125      * @param escapeUnprintable if TRUE then escape the unprintable characters; defaults to FALSE.
126      * @return                  A reference to 'result'.
127      */
128     virtual UnicodeString& toPattern(UnicodeString& result,
129                                      UBool escapeUnprintable = FALSE) const;
130 
131     /**
132      * Implement UnicodeMatcher
133      * Returns TRUE if this matcher will match a character c, where
134      * (c & 0xFF) == v, at offset, in the forward direction (with limit >
135      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
136      * indexing.
137      * @param v    the given value
138      * @return     TRUE if this matcher will match a character c,
139      *             where (c & 0xFF) == v
140      */
141     virtual UBool matchesIndexValue(uint8_t v) const;
142 
143     /**
144      * Implement UnicodeMatcher.  NOTE(review): by analogy with addReplacementSetTo(), presumably unions the characters this matcher can match into toUnionTo -- confirm.
145      */
146     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
147 
148     /**
149      * Implement UnicodeFunctor.  The unnamed parameter is a pointer to a const TransliterationRuleData; NOTE(review): presumably it becomes the new 'data' context object -- confirm.
150      */
151     virtual void setData(const TransliterationRuleData*);
152 
153     /**
154      * Replace characters in 'text' from 'start' to 'limit' with the
155      * output text of this object.  Update the 'cursor' parameter to
156      * give the cursor position and return the length of the
157      * replacement text.
158      *
159      * @param text the text in which characters are to be replaced
160      * @param start inclusive start index of text to be replaced
161      * @param limit exclusive end index of text to be replaced;
162      * must be greater than or equal to start
163      * @param cursor output parameter for the cursor position.
164      * Not all replacer objects will update this, but in a complete
165      * tree of replacer objects, representing the entire output side
166      * of a transliteration rule, at least one must update it.
167      * @return the number of 16-bit code units in the text replacing
168      * the characters at offsets start..(limit-1) in text
169      */
170     virtual int32_t replace(Replaceable& text,
171                             int32_t start,
172                             int32_t limit,
173                             int32_t& cursor);
174 
175     /**
176      * Returns a string representation of this replacer.  If the
177      * result of calling this function is passed to the appropriate
178      * parser, typically TransliteratorParser, it will produce another
179      * replacer that is equal to this one.
180      * @param result the string to receive the pattern.  Previous
181      * contents will be deleted.
182      * @param escapeUnprintable if TRUE then convert unprintable
183      * characters to their hex escape representations, \\uxxxx or
184      * \\Uxxxxxxxx.  Unprintable characters are defined by
185      * Utility.isUnprintable().
186      * @return a reference to 'result'.
187      */
188     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
189                                              UBool escapeUnprintable) const;
190 
191     /**
192      * Remove any match data.  This must be called before performing a
193      * set of matches with this segment.
194      */
195     void resetMatch();
196 
197     /**
198      * ICU "poor man's RTTI", returns a UClassID for the actual class.
199      */
200     virtual UClassID getDynamicClassID() const;
201 
202     /**
203      * ICU "poor man's RTTI", returns a UClassID for this class.
204      */
205     static UClassID U_EXPORT2 getStaticClassID();
206 
207     /**
208      * Union the set of all characters that may be output by this object
209      * into the given set.
210      * @param toUnionTo the set into which to union the output characters
211      */
212     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
213 
214  private:
215 
216     /**
217      * The text to be matched.
218      */
219     UnicodeString pattern;
220 
221     /**
222      * Context object that maps stand-ins to matcher and replacer
223      * objects.  Held as a pointer to const.
224      */
225     const TransliterationRuleData* data;
226 
227     /**
228      * The segment number, 1-based, or 0 if not a segment.
229      */
230     int32_t segmentNumber;
231 
232     /**
233      * Start offset, in the match text, of the <em>rightmost</em>
234      * match.  NOTE(review): the class-level comment describes the output as the "leftmost" match; confirm which is intended.
235      */
236     int32_t matchStart;
237 
238     /**
239      * Limit offset, in the match text, of the <em>rightmost</em>
240      * match.  NOTE(review): see the "leftmost" wording in the class-level comment; confirm which is intended.
241      */
242     int32_t matchLimit;
243 
244 };
245 
246 U_NAMESPACE_END
247 
248 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
249 
250 #endif
251