1 /*
2 **********************************************************************
3 *   Copyright (c) 2001-2012, International Business Machines Corporation
4 *   and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   07/23/01    aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "strmatch.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19 #include "unicode/utf16.h"
20 
21 U_NAMESPACE_BEGIN
22 
// Instantiate the ICU RTTI boilerplate for StringMatcher.
// NOTE(review): the macro body is not visible in this file; presumably it
// defines the class-ID accessors declared in the class header -- confirm.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
24 
25 StringMatcher::StringMatcher(const UnicodeString& theString,
26                              int32_t start,
27                              int32_t limit,
28                              int32_t segmentNum,
29                              const TransliterationRuleData& theData) :
30     data(&theData),
31     segmentNumber(segmentNum),
32     matchStart(-1),
33     matchLimit(-1)
34 {
35     theString.extractBetween(start, limit, pattern);
36 }
37 
/**
 * Copy constructor.  Copies the pattern text, the segment number and the
 * currently recorded match range.  The rule-data pointer is copied as a
 * pointer, so the new object refers to the same data object as the source.
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{
    // Everything is handled by the initializer list.
}
49 
50 /**
51  * Destructor
52  */
~StringMatcher()53 StringMatcher::~StringMatcher() {
54 }
55 
56 /**
57  * Implement UnicodeFunctor
58  */
clone() const59 UnicodeFunctor* StringMatcher::clone() const {
60     return new StringMatcher(*this);
61 }
62 
63 /**
64  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
65  * and return the pointer.
66  */
toMatcher() const67 UnicodeMatcher* StringMatcher::toMatcher() const {
68   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
69   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
70 
71   return nonconst_base;
72 }
73 
74 /**
75  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
76  * and return the pointer.
77  */
toReplacer() const78 UnicodeReplacer* StringMatcher::toReplacer() const {
79   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
80   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
81 
82   return nonconst_base;
83 }
84 
85 /**
86  * Implement UnicodeMatcher
87  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)88 UMatchDegree StringMatcher::matches(const Replaceable& text,
89                                     int32_t& offset,
90                                     int32_t limit,
91                                     UBool incremental) {
92     int32_t i;
93     int32_t cursor = offset;
94     if (limit < cursor) {
95         // Match in the reverse direction
96         for (i=pattern.length()-1; i>=0; --i) {
97             UChar keyChar = pattern.charAt(i);
98             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
99             if (subm == 0) {
100                 if (cursor > limit &&
101                     keyChar == text.charAt(cursor)) {
102                     --cursor;
103                 } else {
104                     return U_MISMATCH;
105                 }
106             } else {
107                 UMatchDegree m =
108                     subm->matches(text, cursor, limit, incremental);
109                 if (m != U_MATCH) {
110                     return m;
111                 }
112             }
113         }
114         // Record the match position, but adjust for a normal
115         // forward start, limit, and only if a prior match does not
116         // exist -- we want the rightmost match.
117         if (matchStart < 0) {
118             matchStart = cursor+1;
119             matchLimit = offset+1;
120         }
121     } else {
122         for (i=0; i<pattern.length(); ++i) {
123             if (incremental && cursor == limit) {
124                 // We've reached the context limit without a mismatch and
125                 // without completing our match.
126                 return U_PARTIAL_MATCH;
127             }
128             UChar keyChar = pattern.charAt(i);
129             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
130             if (subm == 0) {
131                 // Don't need the cursor < limit check if
132                 // incremental is TRUE (because it's done above); do need
133                 // it otherwise.
134                 if (cursor < limit &&
135                     keyChar == text.charAt(cursor)) {
136                     ++cursor;
137                 } else {
138                     return U_MISMATCH;
139                 }
140             } else {
141                 UMatchDegree m =
142                     subm->matches(text, cursor, limit, incremental);
143                 if (m != U_MATCH) {
144                     return m;
145                 }
146             }
147         }
148         // Record the match position
149         matchStart = offset;
150         matchLimit = cursor;
151     }
152 
153     offset = cursor;
154     return U_MATCH;
155 }
156 
157 /**
158  * Implement UnicodeMatcher
159  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const160 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
161                                         UBool escapeUnprintable) const
162 {
163     result.truncate(0);
164     UnicodeString str, quoteBuf;
165     if (segmentNumber > 0) {
166         result.append((UChar)40); /*(*/
167     }
168     for (int32_t i=0; i<pattern.length(); ++i) {
169         UChar keyChar = pattern.charAt(i);
170         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
171         if (m == 0) {
172             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
173         } else {
174             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
175                          TRUE, escapeUnprintable, quoteBuf);
176         }
177     }
178     if (segmentNumber > 0) {
179         result.append((UChar)41); /*)*/
180     }
181     // Flush quoteBuf out to result
182     ICU_Utility::appendToRule(result, -1,
183                               TRUE, escapeUnprintable, quoteBuf);
184     return result;
185 }
186 
187 /**
188  * Implement UnicodeMatcher
189  */
matchesIndexValue(uint8_t v) const190 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
191     if (pattern.length() == 0) {
192         return TRUE;
193     }
194     UChar32 c = pattern.char32At(0);
195     const UnicodeMatcher *m = data->lookupMatcher(c);
196     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
197 }
198 
199 /**
200  * Implement UnicodeMatcher
201  */
addMatchSetTo(UnicodeSet & toUnionTo) const202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
203     UChar32 ch;
204     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
205         ch = pattern.char32At(i);
206         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
207         if (matcher == NULL) {
208             toUnionTo.add(ch);
209         } else {
210             matcher->addMatchSetTo(toUnionTo);
211         }
212     }
213 }
214 
215 /**
216  * UnicodeReplacer API
217  */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t &)218 int32_t StringMatcher::replace(Replaceable& text,
219                                int32_t start,
220                                int32_t limit,
221                                int32_t& /*cursor*/) {
222 
223     int32_t outLen = 0;
224 
225     // Copy segment with out-of-band data
226     int32_t dest = limit;
227     // If there was no match, that means that a quantifier
228     // matched zero-length.  E.g., x (a)* y matched "xy".
229     if (matchStart >= 0) {
230         if (matchStart != matchLimit) {
231             text.copy(matchStart, matchLimit, dest);
232             outLen = matchLimit - matchStart;
233         }
234     }
235 
236     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
237 
238     return outLen;
239 }
240 
241 /**
242  * UnicodeReplacer API
243  */
toReplacerPattern(UnicodeString & rule,UBool) const244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
245                                                 UBool /*escapeUnprintable*/) const {
246     // assert(segmentNumber > 0);
247     rule.truncate(0);
248     rule.append((UChar)0x0024 /*$*/);
249     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
250     return rule;
251 }
252 
253 /**
254  * Remove any match info.  This must be called before performing a
255  * set of matches with this segment.
256  */
resetMatch()257  void StringMatcher::resetMatch() {
258     matchStart = matchLimit = -1;
259 }
260 
261 /**
262  * Union the set of all characters that may output by this object
263  * into the given set.
264  * @param toUnionTo the set into which to union the output characters
265  */
addReplacementSetTo(UnicodeSet &) const266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
267     // The output of this replacer varies; it is the source text between
268     // matchStart and matchLimit.  Since this varies depending on the
269     // input text, we can't compute it here.  We can either do nothing
270     // or we can add ALL characters to the set.  It's probably more useful
271     // to do nothing.
272 }
273 
274 /**
275  * Implement UnicodeFunctor
276  */
setData(const TransliterationRuleData * d)277 void StringMatcher::setData(const TransliterationRuleData* d) {
278     data = d;
279     int32_t i = 0;
280     while (i<pattern.length()) {
281         UChar32 c = pattern.char32At(i);
282         UnicodeFunctor* f = data->lookup(c);
283         if (f != NULL) {
284             f->setData(data);
285         }
286         i += U16_LENGTH(c);
287     }
288 }
289 
290 U_NAMESPACE_END
291 
292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
293 
294 //eof
295