1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  * Copyright (C) 2001-2011, International Business Machines Corporation
5  * and others. All Rights Reserved.
6  **********************************************************************
7  *   Date        Name        Description
8  *   07/23/01    aliu        Creation.
9  **********************************************************************
10  */
11 #ifndef STRMATCH_H
12 #define STRMATCH_H
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_TRANSLITERATION
17 
18 #include "unicode/unistr.h"
19 #include "unicode/unifunct.h"
20 #include "unicode/unimatch.h"
21 #include "unicode/unirepl.h"
22 
23 U_NAMESPACE_BEGIN
24 
25 class TransliterationRuleData;
26 
27 /**
28  * An object that matches a fixed input string, implementing the
29  * UnicodeMatcher API.  This object also implements the
30  * UnicodeReplacer API, allowing it to emit the matched text as
31  * output.  Since the match text may contain flexible match elements,
32  * such as UnicodeSets, the emitted text is not the match pattern, but
33  * instead a substring of the actual matched text.  Following
34  * convention, the output text is the leftmost match seen up to this
35  * point.
36  *
37  * A StringMatcher may represent a segment, in which case it has a
38  * positive segment number.  This affects how the matcher converts
39  * itself to a pattern but does not otherwise affect its function.
40  *
41  * A StringMatcher that is not a segment should not be used as a
42  * UnicodeReplacer.
43  */
44 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
45 
46  public:
47 
48     /**
49      * Construct a matcher that matches the given pattern string.
50      * @param string the pattern to be matched, possibly containing
51      * stand-ins that represent nested UnicodeMatcher objects.
52      * @param start inclusive start index of text to be replaced
53      * @param limit exclusive end index of text to be replaced; must
54      * be >= start (TODO confirm: are start/limit offsets into string?)
55      * @param segmentNum the segment number from 1..n, or 0 if this is
56      * not a segment.
57      * @param data context object mapping stand-ins to
58      * UnicodeMatcher objects.
59      */
60     StringMatcher(const UnicodeString& string,
61                   int32_t start,
62                   int32_t limit,
63                   int32_t segmentNum,
64                   const TransliterationRuleData& data);
65 
66     /**
67      * Copy constructor
68      * @param o  the object to be copied.
69      */
70     StringMatcher(const StringMatcher& o);
71 
72     /**
73      * Destructor
74      */
75     virtual ~StringMatcher();
76 
77     /**
78      * Implement UnicodeFunctor
79      * @return a copy of the object.
80      */
81     virtual UnicodeFunctor* clone() const;
82 
83     /**
84      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
85      * and return the pointer.
86      * @return the UnicodeMatcher pointer.
87      */
88     virtual UnicodeMatcher* toMatcher() const;
89 
90     /**
91      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
92      * and return the pointer.
93      * @return the UnicodeReplacer pointer.
94      */
95     virtual UnicodeReplacer* toReplacer() const;
96 
97     /**
98      * Implement UnicodeMatcher
99      * @param text the text to be matched
100      * @param offset on input, the index into text at which to begin
101      * matching.  On output, the limit of the matched text.  The
102      * number of matched characters is the output value of offset
103      * minus the input value.  Offset should always point to the
104      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
105      * both on entry and upon return.
106      * @param limit the limit index of text to be matched.  Greater
107      * than offset for a forward direction match, less than offset for
108      * a backward direction match.  The last character to be
109      * considered for matching will be text.charAt(limit-1) in the
110      * forward direction or text.charAt(limit+1) in the backward
111      * direction.
112      * @param incremental  if TRUE, then assume further characters may
113      * be inserted at limit and check for partial matching.  Otherwise
114      * assume the text as given is complete.
115      * @return a match degree value indicating a full match, a partial
116      * match, or a mismatch.  If incremental is FALSE then
117      * U_PARTIAL_MATCH should never be returned.
118      */
119     virtual UMatchDegree matches(const Replaceable& text,
120                                  int32_t& offset,
121                                  int32_t limit,
122                                  UBool incremental);
123 
124     /**
125      * Implement UnicodeMatcher
126      * @param result            Output param to receive the pattern.
127      * @param escapeUnprintable if TRUE then escape the unprintable characters.
128      * @return                  A reference to 'result'.
129      */
130     virtual UnicodeString& toPattern(UnicodeString& result,
131                                      UBool escapeUnprintable = FALSE) const;
132 
133     /**
134      * Implement UnicodeMatcher
135      * Returns TRUE if this matcher will match a character c, where c
136      * & 0xFF == v, at offset, in the forward direction (with limit >
137      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
138      * indexing.
139      * @param v    the given value
140      * @return     TRUE if this matcher will match a character c,
141      *             where c & 0xFF == v
142      */
143     virtual UBool matchesIndexValue(uint8_t v) const;
144 
145     /**
146      * Implement UnicodeMatcher (presumably unions the match set into toUnionTo; confirm).
147      */
148     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
149 
150     /**
151      * Implement UnicodeFunctor (presumably updates the 'data' member; confirm).
152      */
153     virtual void setData(const TransliterationRuleData*);
154 
155     /**
156      * Replace characters in 'text' from 'start' to 'limit' with the
157      * output text of this object.  Update the 'cursor' parameter to
158      * give the cursor position and return the length of the
159      * replacement text.
160      *
161      * @param text the text in which characters are replaced
162      * @param start inclusive start index of text to be replaced
163      * @param limit exclusive end index of text to be replaced;
164      * must be greater than or equal to start
165      * @param cursor output parameter for the cursor position.
166      * Not all replacer objects will update this, but in a complete
167      * tree of replacer objects, representing the entire output side
168      * of a transliteration rule, at least one must update it.
169      * @return the number of 16-bit code units in the text replacing
170      * the characters at offsets start..(limit-1) in text
171      */
172     virtual int32_t replace(Replaceable& text,
173                             int32_t start,
174                             int32_t limit,
175                             int32_t& cursor);
176 
177     /**
178      * Returns a string representation of this replacer.  If the
179      * result of calling this function is passed to the appropriate
180      * parser, typically TransliteratorParser, it will produce another
181      * replacer that is equal to this one.
182      * @param result the string to receive the pattern.  Previous
183      * contents will be deleted.
184      * @param escapeUnprintable if TRUE then convert unprintable
185      * characters to their hex escape representations, \\uxxxx or
186      * \\Uxxxxxxxx.  Unprintable characters are defined by
187      * Utility.isUnprintable().
188      * @return a reference to 'result'.
189      */
190     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
191                                              UBool escapeUnprintable) const;
192 
193     /**
194      * Remove any match data.  This must be called before performing a
195      * set of matches with this segment.
196      */
197     void resetMatch();
198 
199     /**
200      * ICU "poor man's RTTI", returns a UClassID for the actual class.
201      */
202     virtual UClassID getDynamicClassID() const;
203 
204     /**
205      * ICU "poor man's RTTI", returns a UClassID for this class.
206      */
207     static UClassID U_EXPORT2 getStaticClassID();
208 
209     /**
210      * Union the set of all characters that may be output by this object
211      * into the given set.
212      * @param toUnionTo the set into which to union the output characters
213      */
214     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
215 
216  private:
217 
218     /**
219      * The text to be matched.
220      */
221     UnicodeString pattern;
222 
223     /**
224      * Context object that maps stand-ins to matcher and replacer
225      * objects.
226      */
227     const TransliterationRuleData* data;
228 
229     /**
230      * The segment number, 1-based, or 0 if not a segment.
231      */
232     int32_t segmentNumber;
233 
234     /**
235      * Start offset, in the match text, of the <em>rightmost</em>
236      * match.  NOTE(review): the class comment says "leftmost"; confirm.
237      */
238     int32_t matchStart;
239 
240     /**
241      * Limit offset, in the match text, of the <em>rightmost</em>
242      * match.  NOTE(review): the class comment says "leftmost"; confirm.
243      */
244     int32_t matchLimit;
245 
246 };
247 
248 U_NAMESPACE_END
249 
250 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
251 
252 #endif
253