• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  filterednormalizer2.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009dec10
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_NORMALIZATION
22 
23 #include "unicode/normalizer2.h"
24 #include "unicode/uniset.h"
25 #include "unicode/unistr.h"
26 #include "unicode/unorm.h"
27 #include "cpputils.h"
28 
29 U_NAMESPACE_BEGIN
30 
~FilteredNormalizer2()31 FilteredNormalizer2::~FilteredNormalizer2() {}
32 
33 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const34 FilteredNormalizer2::normalize(const UnicodeString &src,
35                                UnicodeString &dest,
36                                UErrorCode &errorCode) const {
37     uprv_checkCanGetBuffer(src, errorCode);
38     if(U_FAILURE(errorCode)) {
39         dest.setToBogus();
40         return dest;
41     }
42     if(&dest==&src) {
43         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
44         return dest;
45     }
46     dest.remove();
47     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
48 }
49 
50 // Internal: No argument checking, and appends to dest.
51 // Pass as input spanCondition the one that is likely to yield a non-zero
52 // span length at the start of src.
53 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
54 // USET_SPAN_SIMPLE should be passed in for the start of src
55 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
56 // an in-filter prefix.
57 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,USetSpanCondition spanCondition,UErrorCode & errorCode) const58 FilteredNormalizer2::normalize(const UnicodeString &src,
59                                UnicodeString &dest,
60                                USetSpanCondition spanCondition,
61                                UErrorCode &errorCode) const {
62     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
63     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
64         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
65         int32_t spanLength=spanLimit-prevSpanLimit;
66         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
67             if(spanLength!=0) {
68                 dest.append(src, prevSpanLimit, spanLength);
69             }
70             spanCondition=USET_SPAN_SIMPLE;
71         } else {
72             if(spanLength!=0) {
73                 // Not norm2.normalizeSecondAndAppend() because we do not want
74                 // to modify the non-filter part of dest.
75                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
76                                             tempDest, errorCode));
77                 if(U_FAILURE(errorCode)) {
78                     break;
79                 }
80             }
81             spanCondition=USET_SPAN_NOT_CONTAINED;
82         }
83         prevSpanLimit=spanLimit;
84     }
85     return dest;
86 }
87 
88 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const89 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
90                                               const UnicodeString &second,
91                                               UErrorCode &errorCode) const {
92     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
93 }
94 
95 UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const96 FilteredNormalizer2::append(UnicodeString &first,
97                             const UnicodeString &second,
98                             UErrorCode &errorCode) const {
99     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
100 }
101 
102 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UBool doNormalize,UErrorCode & errorCode) const103 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
104                                               const UnicodeString &second,
105                                               UBool doNormalize,
106                                               UErrorCode &errorCode) const {
107     uprv_checkCanGetBuffer(first, errorCode);
108     uprv_checkCanGetBuffer(second, errorCode);
109     if(U_FAILURE(errorCode)) {
110         return first;
111     }
112     if(&first==&second) {
113         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
114         return first;
115     }
116     if(first.isEmpty()) {
117         if(doNormalize) {
118             return normalize(second, first, errorCode);
119         } else {
120             return first=second;
121         }
122     }
123     // merge the in-filter suffix of the first string with the in-filter prefix of the second
124     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
125     if(prefixLimit!=0) {
126         UnicodeString prefix(second.tempSubString(0, prefixLimit));
127         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
128         if(suffixStart==0) {
129             if(doNormalize) {
130                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
131             } else {
132                 norm2.append(first, prefix, errorCode);
133             }
134         } else {
135             UnicodeString middle(first, suffixStart, INT32_MAX);
136             if(doNormalize) {
137                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
138             } else {
139                 norm2.append(middle, prefix, errorCode);
140             }
141             first.replace(suffixStart, INT32_MAX, middle);
142         }
143     }
144     if(prefixLimit<second.length()) {
145         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
146         if(doNormalize) {
147             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
148         } else {
149             first.append(rest);
150         }
151     }
152     return first;
153 }
154 
155 UBool
getDecomposition(UChar32 c,UnicodeString & decomposition) const156 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
157     return set.contains(c) && norm2.getDecomposition(c, decomposition);
158 }
159 
160 UBool
getRawDecomposition(UChar32 c,UnicodeString & decomposition) const161 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
162     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
163 }
164 
165 UChar32
composePair(UChar32 a,UChar32 b) const166 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
167     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
168 }
169 
170 uint8_t
getCombiningClass(UChar32 c) const171 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
172     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
173 }
174 
175 UBool
isNormalized(const UnicodeString & s,UErrorCode & errorCode) const176 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
177     uprv_checkCanGetBuffer(s, errorCode);
178     if(U_FAILURE(errorCode)) {
179         return FALSE;
180     }
181     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
182     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
183         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
184         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
185             spanCondition=USET_SPAN_SIMPLE;
186         } else {
187             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
188                 U_FAILURE(errorCode)
189             ) {
190                 return FALSE;
191             }
192             spanCondition=USET_SPAN_NOT_CONTAINED;
193         }
194         prevSpanLimit=spanLimit;
195     }
196     return TRUE;
197 }
198 
199 UNormalizationCheckResult
quickCheck(const UnicodeString & s,UErrorCode & errorCode) const200 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
201     uprv_checkCanGetBuffer(s, errorCode);
202     if(U_FAILURE(errorCode)) {
203         return UNORM_MAYBE;
204     }
205     UNormalizationCheckResult result=UNORM_YES;
206     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
207     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
208         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
209         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
210             spanCondition=USET_SPAN_SIMPLE;
211         } else {
212             UNormalizationCheckResult qcResult=
213                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
214             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
215                 return qcResult;
216             } else if(qcResult==UNORM_MAYBE) {
217                 result=qcResult;
218             }
219             spanCondition=USET_SPAN_NOT_CONTAINED;
220         }
221         prevSpanLimit=spanLimit;
222     }
223     return result;
224 }
225 
226 int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode & errorCode) const227 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
228     uprv_checkCanGetBuffer(s, errorCode);
229     if(U_FAILURE(errorCode)) {
230         return 0;
231     }
232     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
233     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
234         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
235         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
236             spanCondition=USET_SPAN_SIMPLE;
237         } else {
238             int32_t yesLimit=
239                 prevSpanLimit+
240                 norm2.spanQuickCheckYes(
241                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
242             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
243                 return yesLimit;
244             }
245             spanCondition=USET_SPAN_NOT_CONTAINED;
246         }
247         prevSpanLimit=spanLimit;
248     }
249     return s.length();
250 }
251 
252 UBool
hasBoundaryBefore(UChar32 c) const253 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
254     return !set.contains(c) || norm2.hasBoundaryBefore(c);
255 }
256 
257 UBool
hasBoundaryAfter(UChar32 c) const258 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
259     return !set.contains(c) || norm2.hasBoundaryAfter(c);
260 }
261 
262 UBool
isInert(UChar32 c) const263 FilteredNormalizer2::isInert(UChar32 c) const {
264     return !set.contains(c) || norm2.isInert(c);
265 }
266 
267 U_NAMESPACE_END
268 
269 // C API ------------------------------------------------------------------- ***
270 
271 U_NAMESPACE_USE
272 
273 U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 * norm2,const USet * filterSet,UErrorCode * pErrorCode)274 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
275     if(U_FAILURE(*pErrorCode)) {
276         return NULL;
277     }
278     if(filterSet==NULL) {
279         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
280         return NULL;
281     }
282     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
283                                              *UnicodeSet::fromUSet(filterSet));
284     if(fn2==NULL) {
285         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
286     }
287     return (UNormalizer2 *)fn2;
288 }
289 
290 #endif  // !UCONFIG_NO_NORMALIZATION
291