1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (c) 1996-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * File unorm.cpp
9 *
10 * Created by: Vladimir Weinstein 12052000
11 *
12 * Modification history :
13 *
14 * Date        Name        Description
15 * 02/01/01    synwee      Added normalization quickcheck enum and method.
16 * 02/12/01    synwee      Commented out quickcheck util api has been approved
17 *                         Added private method for doing FCD checks
18 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
19 *                         string for codepoints < 0x300 for the normalization
20 *                         mode NFC.
21 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
22 *                         instead of just wrappers around normlzr.cpp,
23 *                         load unorm.dat, support Unicode 3.1 with
24 *                         supplementary code points, etc.
25 * 2009-nov..2010-jan  Markus Scherer  total rewrite, new Normalizer2 API & code
26 */
27 
28 #include "unicode/utypes.h"
29 
30 #if !UCONFIG_NO_NORMALIZATION
31 
32 #include "unicode/udata.h"
33 #include "unicode/ustring.h"
34 #include "unicode/uiter.h"
35 #include "unicode/unorm.h"
36 #include "unicode/unorm2.h"
37 #include "normalizer2impl.h"
38 #include "unormimp.h"
39 #include "uprops.h"
40 #include "ustr_imp.h"
41 
42 U_NAMESPACE_USE
43 
44 /* quick check functions ---------------------------------------------------- */
45 
46 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)47 unorm_quickCheck(const UChar *src,
48                  int32_t srcLength,
49                  UNormalizationMode mode,
50                  UErrorCode *pErrorCode) {
51     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52     return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53 }
54 
55 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57                             UNormalizationMode mode, int32_t options,
58                             UErrorCode *pErrorCode) {
59     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60     if(options&UNORM_UNICODE_3_2) {
61         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62         return unorm2_quickCheck(
63             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64             src, srcLength, pErrorCode);
65     } else {
66         return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67     }
68 }
69 
70 U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)71 unorm_isNormalized(const UChar *src, int32_t srcLength,
72                    UNormalizationMode mode,
73                    UErrorCode *pErrorCode) {
74     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75     return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76 }
77 
78 U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80                               UNormalizationMode mode, int32_t options,
81                               UErrorCode *pErrorCode) {
82     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83     if(options&UNORM_UNICODE_3_2) {
84         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85         return unorm2_isNormalized(
86             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87             src, srcLength, pErrorCode);
88     } else {
89         return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90     }
91 }
92 
93 /* normalize() API ---------------------------------------------------------- */
94 
95 /** Public API for normalizing. */
96 U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)97 unorm_normalize(const UChar *src, int32_t srcLength,
98                 UNormalizationMode mode, int32_t options,
99                 UChar *dest, int32_t destCapacity,
100                 UErrorCode *pErrorCode) {
101     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102     if(options&UNORM_UNICODE_3_2) {
103         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104         return unorm2_normalize(
105             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106             src, srcLength, dest, destCapacity, pErrorCode);
107     } else {
108         return unorm2_normalize((const UNormalizer2 *)n2,
109             src, srcLength, dest, destCapacity, pErrorCode);
110     }
111 }
112 
113 
114 /* iteration functions ------------------------------------------------------ */
115 
116 static int32_t
_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,const Normalizer2 * n2,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)117 _iterate(UCharIterator *src, UBool forward,
118               UChar *dest, int32_t destCapacity,
119               const Normalizer2 *n2,
120               UBool doNormalize, UBool *pNeededToNormalize,
121               UErrorCode *pErrorCode) {
122     if(U_FAILURE(*pErrorCode)) {
123         return 0;
124     }
125     if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) {
126         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
127         return 0;
128     }
129 
130     if(pNeededToNormalize!=NULL) {
131         *pNeededToNormalize=FALSE;
132     }
133     if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
134         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
135     }
136 
137     UnicodeString buffer;
138     UChar32 c;
139     if(forward) {
140         /* get one character and ignore its properties */
141         buffer.append(uiter_next32(src));
142         /* get all following characters until we see a boundary */
143         while((c=uiter_next32(src))>=0) {
144             if(n2->hasBoundaryBefore(c)) {
145                 /* back out the latest movement to stop at the boundary */
146                 src->move(src, -U16_LENGTH(c), UITER_CURRENT);
147                 break;
148             } else {
149                 buffer.append(c);
150             }
151         }
152     } else {
153         while((c=uiter_previous32(src))>=0) {
154             /* always write this character to the front of the buffer */
155             buffer.insert(0, c);
156             /* stop if this just-copied character is a boundary */
157             if(n2->hasBoundaryBefore(c)) {
158                 break;
159             }
160         }
161     }
162 
163     UnicodeString destString(dest, 0, destCapacity);
164     if(buffer.length()>0 && doNormalize) {
165         n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
166         if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
167             *pNeededToNormalize= destString!=buffer;
168         }
169         return destString.length();
170     } else {
171         /* just copy the source characters */
172         return buffer.extract(dest, destCapacity, *pErrorCode);
173     }
174 }
175 
176 static int32_t
unorm_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)177 unorm_iterate(UCharIterator *src, UBool forward,
178               UChar *dest, int32_t destCapacity,
179               UNormalizationMode mode, int32_t options,
180               UBool doNormalize, UBool *pNeededToNormalize,
181               UErrorCode *pErrorCode) {
182     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
183     if(options&UNORM_UNICODE_3_2) {
184         const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
185         if(U_FAILURE(*pErrorCode)) {
186             return 0;
187         }
188         FilteredNormalizer2 fn2(*n2, *uni32);
189         return _iterate(src, forward, dest, destCapacity,
190             &fn2, doNormalize, pNeededToNormalize, pErrorCode);
191     }
192     return _iterate(src, forward, dest, destCapacity,
193             n2, doNormalize, pNeededToNormalize, pErrorCode);
194 }
195 
196 U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)197 unorm_previous(UCharIterator *src,
198                UChar *dest, int32_t destCapacity,
199                UNormalizationMode mode, int32_t options,
200                UBool doNormalize, UBool *pNeededToNormalize,
201                UErrorCode *pErrorCode) {
202     return unorm_iterate(src, FALSE,
203                          dest, destCapacity,
204                          mode, options,
205                          doNormalize, pNeededToNormalize,
206                          pErrorCode);
207 }
208 
209 U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)210 unorm_next(UCharIterator *src,
211            UChar *dest, int32_t destCapacity,
212            UNormalizationMode mode, int32_t options,
213            UBool doNormalize, UBool *pNeededToNormalize,
214            UErrorCode *pErrorCode) {
215     return unorm_iterate(src, TRUE,
216                          dest, destCapacity,
217                          mode, options,
218                          doNormalize, pNeededToNormalize,
219                          pErrorCode);
220 }
221 
222 /* Concatenation of normalized strings -------------------------------------- */
223 
224 static int32_t
_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,const Normalizer2 * n2,UErrorCode * pErrorCode)225 _concatenate(const UChar *left, int32_t leftLength,
226                   const UChar *right, int32_t rightLength,
227                   UChar *dest, int32_t destCapacity,
228                   const Normalizer2 *n2,
229                   UErrorCode *pErrorCode) {
230     if(U_FAILURE(*pErrorCode)) {
231         return 0;
232     }
233     if(destCapacity<0 || (dest==NULL && destCapacity>0) ||
234         left==NULL || leftLength<-1 || right==NULL || rightLength<-1) {
235         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
236         return 0;
237     }
238 
239     /* check for overlapping right and destination */
240     if( dest!=NULL &&
241         ((right>=dest && right<(dest+destCapacity)) ||
242          (rightLength>0 && dest>=right && dest<(right+rightLength)))
243     ) {
244         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
245         return 0;
246     }
247 
248     /* allow left==dest */
249     UnicodeString destString;
250     if(left==dest) {
251         destString.setTo(dest, leftLength, destCapacity);
252     } else {
253         destString.setTo(dest, 0, destCapacity);
254         destString.append(left, leftLength);
255     }
256     return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
257            extract(dest, destCapacity, *pErrorCode);
258 }
259 
260 U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)261 unorm_concatenate(const UChar *left, int32_t leftLength,
262                   const UChar *right, int32_t rightLength,
263                   UChar *dest, int32_t destCapacity,
264                   UNormalizationMode mode, int32_t options,
265                   UErrorCode *pErrorCode) {
266     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
267     if(options&UNORM_UNICODE_3_2) {
268         const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
269         if(U_FAILURE(*pErrorCode)) {
270             return 0;
271         }
272         FilteredNormalizer2 fn2(*n2, *uni32);
273         return _concatenate(left, leftLength, right, rightLength,
274             dest, destCapacity, &fn2, pErrorCode);
275     }
276     return _concatenate(left, leftLength, right, rightLength,
277         dest, destCapacity, n2, pErrorCode);
278 }
279 
280 #endif /* #if !UCONFIG_NO_NORMALIZATION */
281