1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  filterednormalizer2.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009dec10
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/normalizer2.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/unorm.h"
25 #include "cpputils.h"
26 
27 U_NAMESPACE_BEGIN
28 
~FilteredNormalizer2()29 FilteredNormalizer2::~FilteredNormalizer2() {}
30 
31 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const32 FilteredNormalizer2::normalize(const UnicodeString &src,
33                                UnicodeString &dest,
34                                UErrorCode &errorCode) const {
35     uprv_checkCanGetBuffer(src, errorCode);
36     if(U_FAILURE(errorCode)) {
37         dest.setToBogus();
38         return dest;
39     }
40     if(&dest==&src) {
41         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
42         return dest;
43     }
44     dest.remove();
45     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
46 }
47 
48 // Internal: No argument checking, and appends to dest.
49 // Pass as input spanCondition the one that is likely to yield a non-zero
50 // span length at the start of src.
51 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
52 // USET_SPAN_SIMPLE should be passed in for the start of src
53 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
54 // an in-filter prefix.
55 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,USetSpanCondition spanCondition,UErrorCode & errorCode) const56 FilteredNormalizer2::normalize(const UnicodeString &src,
57                                UnicodeString &dest,
58                                USetSpanCondition spanCondition,
59                                UErrorCode &errorCode) const {
60     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
61     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
62         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
63         int32_t spanLength=spanLimit-prevSpanLimit;
64         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
65             if(spanLength!=0) {
66                 dest.append(src, prevSpanLimit, spanLength);
67             }
68             spanCondition=USET_SPAN_SIMPLE;
69         } else {
70             if(spanLength!=0) {
71                 // Not norm2.normalizeSecondAndAppend() because we do not want
72                 // to modify the non-filter part of dest.
73                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
74                                             tempDest, errorCode));
75                 if(U_FAILURE(errorCode)) {
76                     break;
77                 }
78             }
79             spanCondition=USET_SPAN_NOT_CONTAINED;
80         }
81         prevSpanLimit=spanLimit;
82     }
83     return dest;
84 }
85 
86 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const87 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
88                                               const UnicodeString &second,
89                                               UErrorCode &errorCode) const {
90     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
91 }
92 
93 UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const94 FilteredNormalizer2::append(UnicodeString &first,
95                             const UnicodeString &second,
96                             UErrorCode &errorCode) const {
97     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
98 }
99 
100 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UBool doNormalize,UErrorCode & errorCode) const101 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
102                                               const UnicodeString &second,
103                                               UBool doNormalize,
104                                               UErrorCode &errorCode) const {
105     uprv_checkCanGetBuffer(first, errorCode);
106     uprv_checkCanGetBuffer(second, errorCode);
107     if(U_FAILURE(errorCode)) {
108         return first;
109     }
110     if(&first==&second) {
111         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
112         return first;
113     }
114     if(first.isEmpty()) {
115         if(doNormalize) {
116             return normalize(second, first, errorCode);
117         } else {
118             return first=second;
119         }
120     }
121     // merge the in-filter suffix of the first string with the in-filter prefix of the second
122     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
123     if(prefixLimit!=0) {
124         UnicodeString prefix(second.tempSubString(0, prefixLimit));
125         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
126         if(suffixStart==0) {
127             if(doNormalize) {
128                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
129             } else {
130                 norm2.append(first, prefix, errorCode);
131             }
132         } else {
133             UnicodeString middle(first, suffixStart, INT32_MAX);
134             if(doNormalize) {
135                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
136             } else {
137                 norm2.append(middle, prefix, errorCode);
138             }
139             first.replace(suffixStart, INT32_MAX, middle);
140         }
141     }
142     if(prefixLimit<second.length()) {
143         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
144         if(doNormalize) {
145             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
146         } else {
147             first.append(rest);
148         }
149     }
150     return first;
151 }
152 
153 UBool
getDecomposition(UChar32 c,UnicodeString & decomposition) const154 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
155     return set.contains(c) && norm2.getDecomposition(c, decomposition);
156 }
157 
158 UBool
getRawDecomposition(UChar32 c,UnicodeString & decomposition) const159 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
160     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
161 }
162 
163 UChar32
composePair(UChar32 a,UChar32 b) const164 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
165     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
166 }
167 
168 uint8_t
getCombiningClass(UChar32 c) const169 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
170     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
171 }
172 
173 UBool
isNormalized(const UnicodeString & s,UErrorCode & errorCode) const174 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
175     uprv_checkCanGetBuffer(s, errorCode);
176     if(U_FAILURE(errorCode)) {
177         return FALSE;
178     }
179     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
180     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
181         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
182         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
183             spanCondition=USET_SPAN_SIMPLE;
184         } else {
185             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
186                 U_FAILURE(errorCode)
187             ) {
188                 return FALSE;
189             }
190             spanCondition=USET_SPAN_NOT_CONTAINED;
191         }
192         prevSpanLimit=spanLimit;
193     }
194     return TRUE;
195 }
196 
197 UNormalizationCheckResult
quickCheck(const UnicodeString & s,UErrorCode & errorCode) const198 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
199     uprv_checkCanGetBuffer(s, errorCode);
200     if(U_FAILURE(errorCode)) {
201         return UNORM_MAYBE;
202     }
203     UNormalizationCheckResult result=UNORM_YES;
204     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
205     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
206         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
207         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
208             spanCondition=USET_SPAN_SIMPLE;
209         } else {
210             UNormalizationCheckResult qcResult=
211                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
212             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
213                 return qcResult;
214             } else if(qcResult==UNORM_MAYBE) {
215                 result=qcResult;
216             }
217             spanCondition=USET_SPAN_NOT_CONTAINED;
218         }
219         prevSpanLimit=spanLimit;
220     }
221     return result;
222 }
223 
224 int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode & errorCode) const225 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
226     uprv_checkCanGetBuffer(s, errorCode);
227     if(U_FAILURE(errorCode)) {
228         return 0;
229     }
230     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
231     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
232         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
233         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
234             spanCondition=USET_SPAN_SIMPLE;
235         } else {
236             int32_t yesLimit=
237                 prevSpanLimit+
238                 norm2.spanQuickCheckYes(
239                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
240             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
241                 return yesLimit;
242             }
243             spanCondition=USET_SPAN_NOT_CONTAINED;
244         }
245         prevSpanLimit=spanLimit;
246     }
247     return s.length();
248 }
249 
250 UBool
hasBoundaryBefore(UChar32 c) const251 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
252     return !set.contains(c) || norm2.hasBoundaryBefore(c);
253 }
254 
255 UBool
hasBoundaryAfter(UChar32 c) const256 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
257     return !set.contains(c) || norm2.hasBoundaryAfter(c);
258 }
259 
260 UBool
isInert(UChar32 c) const261 FilteredNormalizer2::isInert(UChar32 c) const {
262     return !set.contains(c) || norm2.isInert(c);
263 }
264 
265 U_NAMESPACE_END
266 
267 // C API ------------------------------------------------------------------- ***
268 
269 U_NAMESPACE_USE
270 
271 U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 * norm2,const USet * filterSet,UErrorCode * pErrorCode)272 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
273     if(U_FAILURE(*pErrorCode)) {
274         return NULL;
275     }
276     if(filterSet==NULL) {
277         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
278         return NULL;
279     }
280     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
281                                              *UnicodeSet::fromUSet(filterSet));
282     if(fn2==NULL) {
283         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
284     }
285     return (UNormalizer2 *)fn2;
286 }
287 
288 #endif  // !UCONFIG_NO_NORMALIZATION
289